Just a little demonstration of what the bootstrapping process looks like and how the bootstrapped predictions from a logistic regression compare to the standard errors taken directly from the model. The first steps are loading the environmental data, creating the pseudoabsences and running the models.
ed <- raster::stack('C:/Users/thoval/Documents/Analyses/Data/lcm_had_elev_national_grid.gri')
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj
## = prefer_proj): Discarded datum Unknown based on Airy 1830 ellipsoid in CRS
## definition
names(ed)
## [1] "sea" "broad_wood" "conif_wood"
## [4] "arable" "impr_grass" "neutr_grass"
## [7] "calc_grass" "acid_grass" "fen_marsh_swamp"
## [10] "heather" "heath_grass" "bog"
## [13] "inland_rock" "saltwater" "freshwater"
## [16] "sup_lit_rock" "sup_lit_sed" "lit_rock"
## [19] "lit_sed" "saltmarsh" "urban"
## [22] "suburban" "AnnualTemp" "MeanDiRange"
## [25] "Isotherm" "TempSeasonality" "MaxTempWarmestMonth"
## [28] "MinTempColdestMonth" "TempAnnualRange" "MeanTempWetQuarter"
## [31] "MeanTempDriestQuarter" "MeanTempWarmQuarter" "MeanTempColdQuarter"
## [34] "AnnualPrecip" "PrecipWetMonth" "PrecipDriestMonth"
## [37] "PrecipSeasonality" "PrecipWettestQuarter" "PrecipDriestQuarter"
## [40] "PrecipWarmQuarter" "PrecipColdQuarter" "elevation_UK"
## [43] "slope" "aspect"
# there is a big problem with the slope and aspect layers, so drop them and recompute both from the elevation layer (layer 42, elevation_UK)
ed <- dropLayer(ed, i = match(c("slope", "aspect"), names(ed)))
slope <- terrain(ed[[42]], opt = "slope", unit = "degrees")
aspect <- terrain(ed[[42]], opt = "aspect", unit = "degrees")
ed <- raster::stack(ed, slope, aspect)
names(ed)
## [1] "sea" "broad_wood" "conif_wood"
## [4] "arable" "impr_grass" "neutr_grass"
## [7] "calc_grass" "acid_grass" "fen_marsh_swamp"
## [10] "heather" "heath_grass" "bog"
## [13] "inland_rock" "saltwater" "freshwater"
## [16] "sup_lit_rock" "sup_lit_sed" "lit_rock"
## [19] "lit_sed" "saltmarsh" "urban"
## [22] "suburban" "AnnualTemp" "MeanDiRange"
## [25] "Isotherm" "TempSeasonality" "MaxTempWarmestMonth"
## [28] "MinTempColdestMonth" "TempAnnualRange" "MeanTempWetQuarter"
## [31] "MeanTempDriestQuarter" "MeanTempWarmQuarter" "MeanTempColdQuarter"
## [34] "AnnualPrecip" "PrecipWetMonth" "PrecipDriestMonth"
## [37] "PrecipSeasonality" "PrecipWettestQuarter" "PrecipDriestQuarter"
## [40] "PrecipWarmQuarter" "PrecipColdQuarter" "elevation_UK"
## [43] "slope" "aspect"
plot(ed[[43]], main = names(ed[[43]]), ylim=c(780000, 860000), xlim = c(60000, 200000))
plot(ed[[44]], main = names(ed[[44]]), ylim=c(780000, 860000), xlim = c(60000, 200000))
Plot the number of data points for the whole of the UK and for the region of interest that I’ve chosen. We can see that Tyria jacobaeae is the most recorded species at both extents.
dfm_df <- read_csv("C:/Users/thoval/Documents/Analyses/DayFlyingMoths.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Warning: Duplicated column names deduplicated: 'X1' => 'X1_1' [2]
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## TO_STARTDATE = col_character(),
## TO_ENDDATE = col_character(),
## TO_GRIDREF = col_character(),
## TO_LOCALITY = col_character(),
## SQ_10 = col_character(),
## CONCEPT = col_character(),
## date = col_date(format = ""),
## sp_n = col_character(),
## com_n = col_character(),
## ig = col_character(),
## ag = col_character(),
## id = col_character()
## )
## i Use `spec()` for the full column specifications.
head(dfm_df)
## # A tibble: 6 x 25
## X1 X1_1 TO_ID TO_STARTDATE TO_ENDDATE DT_ID TO_GRIDREF TO_LOCALITY GR_ID
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl>
## 1 1 165 1410 13/07/2005 13/07/2005 1 NM430268 Isle of Mu~ 1
## 2 2 166 1411 13/07/2005 13/07/2005 1 NM430268 Isle of Mu~ 1
## 3 3 1466 6557 02/07/2005 02/07/2005 1 SK322596 Ash Brook,~ 1
## 4 4 1937 10384 16/07/2002 16/07/2002 1 SV918113 St Mary's ~ 1
## 5 5 2393 13520 21/07/2004 21/07/2004 1 SK440660 Holmewood ~ 1
## 6 6 2394 13521 21/07/2004 21/07/2004 1 SK440660 Holmewood ~ 1
## # ... with 16 more variables: VC_ID <dbl>, SQ_10 <chr>, CONCEPT <chr>,
## # TO_PRECISION <dbl>, LATITUDE <dbl>, LONGITUDE <dbl>, lon <dbl>, lat <dbl>,
## # date <date>, yr <dbl>, jd <dbl>, sp_n <chr>, com_n <chr>, ig <chr>,
## # ag <chr>, id <chr>
# get eastings and northings
dfm_df <- c_en(dfm_df)
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum OSGB 1936 in CRS definition
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj
## = prefer_proj): Discarded datum Unknown based on Airy 1830 ellipsoid in CRS
## definition
dfm_df %>% group_by(sp_n) %>% tally %>%
ggplot(aes(x = reorder(sp_n, n), y = n)) +
geom_point() +
theme(axis.text.x = element_text(angle = 50, hjust=1)) +
xlab("species") + ggtitle("All UK") +
ylab("Number of sightings")
sp_y <- subset(dfm_df, lat > 53.1 & lat <= 54.4 &
lon > -3.9 & lon < 0.2) %>%
mutate(species = sp_n,
year = yr)
sp_y %>% group_by(sp_n) %>% tally %>%
ggplot(aes(x = reorder(sp_n, n), y = n)) +
geom_point() +
theme(axis.text.x = element_text(angle = 50, hjust=1)) +
xlab("species") + ggtitle("Subset UK, 53.1 < latitude <= 54.4") +
ylab("Number of sightings")
Generate the pseudoabsences.
#### Create presence/absence
spp <- unique(sp_y$species)
# as a hack, rather than writing a new version of the function for eastings and
# northings, create a data frame with EASTING and NORTHING renamed to lon and lat
ndf <- sp_y %>% mutate(lon = EASTING,
lat = NORTHING) %>%
dplyr::select(-EASTING, -NORTHING)
registerDoParallel(cores = detectCores() - 1)
system.time(
ab1 <- foreach(i = 1 : length(spp)) %dopar% {
cpa(spdat = ndf, species = spp[i], matchPres = TRUE,
minYear = 2000, maxYear = 2017, recThresh = 5)
}
)
## user system elapsed
## 0.11 0.15 0.78
registerDoSEQ()
names(ab1) <- spp
Parallelised logistic regression SDM with 10 k-fold validations to get the bootstrapped errors; less than 10 minutes' run time.
Set the k value to the number of bootstraps that you want for the model. This randomly resamples the data and fits a model k times; the fitted models are stored in the output object and can then be used to predict the probability of presence.
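As a rough illustration of what I assume happens inside fsdm (the real resampling is internal to that function, so this sketch is illustrative only), each bootstrap resamples the presence/pseudoabsence rows with replacement and refits the logistic regression. Here val is the presence/absence column used in the plotting code further down, and boot_glm is a made-up name:
# illustrative sketch only -- the actual bootstrap lives inside fsdm()
boot_glm <- function(dat, k = 10) {
  lapply(seq_len(k), function(i) {
    # resample rows with replacement
    samp <- dat[sample(nrow(dat), replace = TRUE), ]
    # refit the logistic regression on the resampled data
    glm(val ~ ., data = samp, family = binomial)
  })
}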
# do all species in parallel to save time
registerDoParallel(cores = detectCores() - 1)
system.time(
spp_lr_out <- foreach(s = 1:length(spp), #.combine='comb', .multicombine=TRUE,
# .init=list(list()),
.packages = c('tidyverse', 'raster', 'readr', 'dismo'),
.errorhandling = 'pass') %dopar% {
# print(paste(s, spp[s], sep = " "))
# NB [[s]] not [s]: ab1[s] is a length-one list, so is never NULL;
# also return NULL rather than calling next, which is not valid inside a foreach body
if(is.null(ab1[[s]])){
# print(paste("No data for species", spp[s]))
return(NULL)
}
sdm_lr <- fsdm(species = spp[s], model = "lr",
climDat = ht, spData = ab1, knots = -1,
k = 10,
prediction = T,
inters = F,
write = F, outPath = "C:/Users/thoval/Documents/Analyses/lr_outs/")
se_out <- predict(ht, sdm_lr$Model, type = "response", se.fit = TRUE, index = 2)
# save(se_out, file = paste0("C:/Users/thoval/Documents/Analyses/lr_outs/", spp[s], "_lr_se_error_prediction.rdata"))
list(sdm_lr, se_out)
}
)
## user system elapsed
## 16.11 27.42 588.76
registerDoSEQ()
The output of this function is a bit odd, as it is a list nested within another list. The top level has one entry per species; within each of those, [[1]] is the output from the fsdm function and [[2]] is the standard error prediction. Obviously, for models that do not produce a standard error prediction there will be no entry in the second slot.
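To make that structure concrete (using the second species in the list, Zygaena filipendulae, as an example):
str(spp_lr_out[[2]], max.level = 1)
# spp_lr_out[[2]][[1]]  # fsdm() output: model, bootstrapped models, predictions, data
# spp_lr_out[[2]][[2]]  # standard error raster from predict(..., se.fit = TRUE)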
At the moment this is just a regular for loop that predicts the probability of presence for the whole region of interest, which makes it slow. The predict function has an extent argument, though, meaning that we can predict for smaller subsets of a species' overall distribution.
First, let's choose the extent to project to, so that we can reduce processing time and get a more detailed view.
# the extent matrix is filled column-wise, so the vector order is xmin, ymin, xmax, ymax
ext <- extent(matrix(c(340000,450000, 400000,490000), ncol = 2))
plot(spp_lr_out[[2]][[1]]$Predictions, main = spp_lr_out[[2]][[1]]$Species)
points(x = spp_lr_out[[2]][[1]]$Data$lon[spp_lr_out[[2]][[1]]$Data$val == 1],
spp_lr_out[[2]][[1]]$Data$lat[spp_lr_out[[2]][[1]]$Data$val == 1],
cex = 0.6, col = "red", pch = 20)
plot(ext, add = T)
test_area <- crop(spp_lr_out[[2]][[1]]$Predictions, ext)
plot(test_area, main = paste(spp_lr_out[[2]][[1]]$Species, 'cropped'))
Now predict from each of the models that were generated on a subset of the data. This takes a while but can almost certainly be sped up. Just some notes to myself here really…
There is probably a way to speed this up, but I need to figure out how to parallelise nested for loops, which is tricky because of the need to catch and skip (known) errors.
I think the way to do this is to change the second for loop to iterate over 1:k (i.e. the number of bootstrapped models, 10 here, rather than the length of the object itself). The errors arise because some of the spp_lr_out[[]] entries are NULL, so you can't take the length of those objects. Then use the %:% and %dopar% operators to send it to multiple cores; there's a rough sketch of this below.
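Something like the following, although I haven't tested it; it assumes 10 bootstrapped models per species (k = 10 above) and leans on .errorhandling = 'pass' to absorb the NULL entries in spp_lr_out:
registerDoParallel(cores = detectCores() - 1)
boot_preds <- foreach(s = seq_along(spp_lr_out), .errorhandling = 'pass') %:%
  foreach(b = 1:10, .packages = 'raster', .errorhandling = 'pass') %dopar% {
    # NULL list entries throw a subscript error, which 'pass' stores rather than stopping
    b_t <- spp_lr_out[[s]][[1]]$Bootstrapped_models[[b]]
    predict(ht, b_t, type = 'response', ext = ext)
  }
registerDoSEQ()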
spp_out_boot <- list()
# beginCluster(n = 7)
system.time(
for(s in 1:length(spp_lr_out)){
# some entries in the list are NULL because of too few data points in the kfold cross validation part of the model above
skip_to_next <- FALSE
# find the error
tryCatch({
bt_mods_loc <- spp_lr_out[[s]][[1]]$Bootstrapped_models
},
error = function(e) {skip_to_next <<- TRUE})
# skip to next if error
if(skip_to_next){print(paste("SKIPPING LIST ENTRY", s))
next}
print(paste(s, spp_lr_out[[s]][[1]]$Species))
pred_out_boot <- list()
for(b in 1:length(spp_lr_out[[s]][[1]]$Bootstrapped_models)) {
# print(b)
# predict from each bootstrapped model
b_t <- spp_lr_out[[s]][[1]]$Bootstrapped_models[[b]]
# non-parallel
p_t <- suppressWarnings(predict(ht, b_t, index = NULL, type = 'response', ext = ext))
# parallel
# p_t <- clusterR(ht, predict, args = list(b_t, type = 'response', index = NULL, ext = ext), export='ext')
# store
pred_out_boot[[b]] <- p_t
}
# stack all the predictions and store as outputs
boots <- do.call("stack", pred_out_boot)
spp_out_boot[[spp_lr_out[[s]][[1]]$Species]] <- boots
}
)
## [1] "1 Tyria jacobaeae"
## [1] "2 Zygaena filipendulae"
## [1] "3 Zygaena lonicerae"
## [1] "4 Chiasmia clathrata"
## [1] "5 Odezia atrata"
## [1] "6 Ematurga atomaria"
## [1] "7 Perizoma albulata"
## [1] "8 Archiearis parthenias"
## [1] "9 Euclidia mi"
## [1] "10 Pseudopanthera macularia"
## [1] "11 Anarta myrtilli"
## [1] "12 Parasemia plantaginis"
## [1] "13 Panemeria tenebrata"
## [1] "14 Adscita statices"
## [1] "15 Scotopteryx bipunctaria"
## [1] "16 Lycia zonaria"
## [1] "17 Orgyia antiqua"
## [1] "18 Adscita geryon"
## [1] "19 Euclidia glyphica"
## [1] "SKIPPING LIST ENTRY 20"
## [1] "21 Orgyia recens"
## [1] "22 Photedes captiuncula"
## [1] "23 Phytometra viridaria"
## [1] "24 Idaea muricata"
## [1] "25 Rheumaptera hastata"
## [1] "26 Epirrhoe tristata"
## [1] "27 Hemaris fuciformis"
## [1] "SKIPPING LIST ENTRY 28"
## [1] "SKIPPING LIST ENTRY 29"
## [1] "SKIPPING LIST ENTRY 30"
## user system elapsed
## 196.58 52.22 252.81
# endCluster()
This sometimes throws a memory allocation error (cannot allocate vector of size xxxMB) when using larger extents.
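One possible mitigation, which I haven't tried here, is lowering raster's in-memory threshold so that calc and predict process large rasters in chunks on disk; the values below are arbitrary:
# untested: force chunked, on-disk processing for large rasters
rasterOptions(maxmemory = 1e+08, chunksize = 1e+07)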
The quantile/range function takes a long time when projecting to a large area; with the extent that we decided on above it's not too bad (under a minute).
This block of code calculates the 0.05 and 0.95 quantiles of the bootstrapped predictions in each cell, subtracts the lower from the upper, and then plots the result.
# registerDoParallel(cores = detectCores() - 1)
zl_q <- calc(spp_out_boot$`Zygaena filipendulae`, fun = function(x) {quantile(x, probs = c(0.05, 0.95), na.rm = T)})
# system.time(
# zl_q <- clusterR(spp_out_boot$`Zygaena filipendulae`, calc,
# args = list(fun = function(x) {quantile(x, probs = c(0.05, 0.95), na.rm = T)}))
# )
#
# registerDoSEQ()
zl_q
## class : RasterBrick
## dimensions : 400, 600, 240000, 2 (nrow, ncol, ncell, nlayers)
## resolution : 100, 100 (x, y)
## extent : 340000, 4e+05, 450000, 490000 (xmin, xmax, ymin, ymax)
## crs : +proj=tmerc +lat_0=49 +lon_0=-2 +k=0.9996012717 +x_0=400000 +y_0=-100000 +ellps=airy +units=m +no_defs
## source : memory
## names : X5., X95.
## min values : 0.04783858, 0.06358882
## max values : 0.9670543, 1.0000000
# width of the 90% interval (95th quantile minus 5th)
zl_q_5_95 <- zl_q[[2]] - zl_q[[1]]
# names(ab1) # zygaena filip is second in the list
par(mfrow = c(1,2))
plot(test_area, main = paste(spp_lr_out[[2]][[1]]$Species, "distribution"))
# points(x = spp_lr_out[[2]][[1]]$Data$lon[spp_lr_out[[2]][[1]]$Data$val == 1],
# spp_lr_out[[2]][[1]]$Data$lat[spp_lr_out[[2]][[1]]$Data$val == 1],
# cex = 0.6, col = "red", pch = 20)
plot(zl_q_5_95, main = "95% - 5% quantile")
points(x = spp_lr_out[[2]][[1]]$Data$lon[spp_lr_out[[2]][[1]]$Data$val == 1],
spp_lr_out[[2]][[1]]$Data$lat[spp_lr_out[[2]][[1]]$Data$val == 1],
cex = 0.6, col = "red", pch = 20)
par(mfrow = c(1,1))
Now let’s do the same for all of the species in the bootstrapped outputs.
length(spp_out_boot)
## [1] 26
registerDoParallel(cores = detectCores() - 1)
system.time(
boot_out <- foreach(bs = 1:length(spp_out_boot), #.combine='comb', .multicombine=TRUE,
# .init=list(list()),
.packages = c('raster'),
.errorhandling = 'pass') %dopar% {
sp_bts <- spp_out_boot[[bs]]
# 0.05 and 0.95 quantiles across the bootstrapped predictions in each cell
sp_q <- calc(sp_bts, fun = function(x) {quantile(x, probs = c(0.05, 0.95),
na.rm = T)})
# width of the 90% interval (95th quantile minus 5th)
sp_q_5_95 <- sp_q[[2]] - sp_q[[1]]
names(sp_q_5_95) <- names(spp_out_boot)[bs]
sp_q_5_95
})
## user system elapsed
## 3.72 2.81 298.64
registerDoSEQ()
Plot all of the species together. It's important to remember that these plots are still from logistic regression models, so some of the predictions aren't great. For the species with relatively few data points, you can see that there is a lot of variation in the bootstrapped predictions. I don't think this would be a problem for the adaptive sampling, because the variation layer would just carry a lot less weight than the distance-to-nearest-record layer or the probability of presence layer. One of the key benefits of a bootstrapped variation/error layer is that it is bounded between 0 and 1, which means the figures won't be skewed by areas of really high uncertainty (though this might also be a drawback, of course, if those are the areas that would benefit most from extra sampling).
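For what it's worth, the combination I have in mind for the adaptive sampling layers is just a weighted sum of rasters on a common grid; the layer names and weights below are entirely hypothetical stand-ins:
# hypothetical sketch: prob_presence, uncertainty and dist_nearest_record are
# stand-in rasters, and the weights are placeholders
sampling_priority <- 0.5 * prob_presence + 0.3 * dist_nearest_record + 0.2 * uncertainty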
par(mfrow = c(1,3))
# boot_out has 26 entries but spp_lr_out has more, because some list entries
# were skipped above; list entry 20 was skipped, so for j >= 20 the matching
# spp_lr_out entry is j + 1
for(j in c(1:26)){
if(length(spp_out_boot)>26){
plot(spp_lr_out[[j]][[1]]$Predictions, main = spp_lr_out[[j]][[1]]$Species,
xlim = c(340000, 400000),
ylim = c(450000, 490000))
plot(spp_lr_out[[j]][[2]], main = paste(spp_lr_out[[j]][[1]]$Species, 'Standard error'),
xlim = c(340000, 400000),
ylim = c(450000, 490000))
plot(boot_out[[j]], main = paste(names(boot_out[[j]]), '95% - 5% quantile'))
} else if(length(spp_out_boot)==26){
if(j<20){
plot(spp_lr_out[[j]][[1]]$Predictions, main = spp_lr_out[[j]][[1]]$Species,
xlim = c(340000, 400000),
ylim = c(450000, 490000))
plot(spp_lr_out[[j]][[2]], main = paste(spp_lr_out[[j]][[1]]$Species, 'Standard error'),
xlim = c(340000, 400000),
ylim = c(450000, 490000))
plot(boot_out[[j]], main = paste(names(boot_out[[j]]), '95% - 5% quantile'))
} else if(j>=20){
plot(spp_lr_out[[j+1]][[1]]$Predictions, main = spp_lr_out[[j+1]][[1]]$Species,
xlim = c(340000, 400000),
ylim = c(450000, 490000))
plot(spp_lr_out[[j+1]][[2]], main = paste(spp_lr_out[[j+1]][[1]]$Species, 'Standard error'),
xlim = c(340000, 400000),
ylim = c(450000, 490000))
plot(boot_out[[j]], main = paste(names(boot_out[[j]]), '95% - 5% quantile'))
}
}
}